import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load the Play Store dataset, indexing rows by the app name.
MainModule = pd.read_csv('googleplaystore.csv', index_col="App")
MainModule.head()   # get first records
MainModule.tail()   # get the last records
MainModule.info()   # column dtypes and non-null counts

# Summarize missing data: for every column containing NaNs, record the
# absolute null count and the percentage of rows affected.
# (Compute isna().sum() once instead of re-evaluating isna().any() per column.)
na_counts = MainModule.isna().sum()
nulls = na_counts[na_counts > 0].index.tolist()
counts = [int(na_counts[col]) for col in nulls]
rates = [na_counts[col] / MainModule.shape[0] * 100 for col in nulls]
null_DataFrame = pd.DataFrame.from_dict(
    {"Col": nulls, "Count": counts, "Null_Rates": rates})
null_DataFrame
#Missing Values( Data Cleaning For Null values )
# Drop the rows where 'Type' or 'Content Rating' is null.
# (dropna(subset=...) replaces the original per-column drop loop, which used
# the positional axis argument removed in pandas 2.0.)
df_train = MainModule.copy()
df_train = df_train.dropna(subset=['Type', 'Content Rating'])
df_train.info()
# Rating is expected to be on a 1-to-5 scale.
df_train.Rating.describe()
# Remove corrupted rows whose Rating exceeds 5 (this comparison is False for
# NaN, so unrated rows are dropped here as well).
df_train = df_train[df_train["Rating"] <= 5]
#1. Category
# For machine learning we convert the Category strings to numbers via
# one-hot encoding.
df_train.Category.unique()
df_dummy = df_train.copy()
# BUG FIX: the original split this statement across two lines, assigning the
# pd.Categorical *class* itself to the column; the intended call is below.
df_train.Category = pd.Categorical(df_dummy.Category)
x = df_dummy[['Category']]
del df_dummy['Category']
dummies = pd.get_dummies(x, prefix='Category')
df_dummy = pd.concat([df_dummy, dummies], axis=1)
df_dummy.head()
#2. Genres (Types)
df_dummy["Genres"].unique()
plt.figure(figsize=(25, 6))
# Compute the genre frequencies once and reuse them below (the original
# re-evaluated value_counts() on every loop iteration).
genre_counts = df_dummy.Genres.value_counts()
sns.barplot(x=genre_counts.index, y=genre_counts)
plt.xticks(rotation=80)
plt.title("Category And There Counts")
np.sort(genre_counts)
# Genres with fewer than 20 uploads are too sparse to analyze individually,
# so collapse them into a single 'Others' bucket (set gives O(1) membership).
Low_Uploads = set(genre_counts[genre_counts < 20].index)
print(len(Low_Uploads), "Low Uploads Count those are Less then 20")
df_dummy.Genres = ['Others' if g in Low_Uploads else g for g in df_dummy.Genres]
df_dummy.shape
#3. Content Rating
df_dummy["Content Rating"].value_counts(dropna=False)
Data = df_dummy.copy()
# Map the content-rating categories to ordinal floats.
# BUG FIX: the original key 'Unreted' was a typo — the dataset value is
# 'Unrated', so those rows silently became NaN under the old mapping.
Data['Content Rating'] = Data['Content Rating'].map({
    'Unrated': 0.0,
    "Everyone": 1.0,
    "Everyone 10+": 2.0,
    'Teen': 3.0,
    'Adults only 18+': 4.0,
    'Mature 17+': 5.0,
})
Data['Content Rating'] = Data["Content Rating"].astype(float)
Data.head()
#4.Reviews
# Cast review counts from strings to floats.
Data_Cpy = Data.copy()
Data_Cpy['Reviews'] = Data_Cpy['Reviews'].astype(float)
#5.Size
Data_Cpy["Size"].value_counts()
# Normalize app sizes to bytes: 'M' -> millions, 'k' -> thousands.
# 'Varies with device' (and anything else) is treated as missing and filled
# with the column median. Using NaN instead of the string sentinel "Unknown"
# keeps the series numeric throughout.
sizes = []
for raw in Data_Cpy["Size"]:
    if 'M' in raw:
        sizes.append(float(raw.replace('M', '')) * 1000000)
    elif 'k' in raw:
        sizes.append(float(raw.replace('k', '')) * 1000)
    else:
        sizes.append(np.nan)
size_series = pd.Series(sizes)
size_series = size_series.fillna(size_series.median())
Data_Cpy["Size"] = size_series.values
del sizes, size_series
# Show one example of the transformation. Use .iloc: the frame is indexed by
# app name, so bare integer access relied on the deprecated positional
# fallback for non-integer indexes.
print("Old", Data["Size"].iloc[10], "New", Data_Cpy["Size"].iloc[10])
#6. Price: strip the '$' prefix and convert to float; free apps become 0.0.
Data_Cpy['Price'] = [float(p.split('$')[1]) if '$' in p else float(0) for p in Data_Cpy['Price']]
# .iloc for positional access — the index holds app names, so a bare integer
# key relied on pandas' deprecated positional fallback.
print("old: ", Data['Price'].iloc[9054], " new: ", Data_Cpy['Price'].iloc[9054])
Data_Cpy.Price.unique()
#7. Installs: strip the '+' suffix and thousands separators, then convert
# to float. BUG FIX: the original mapped any value containing neither '+'
# nor ',' straight to 0.0, which zeroed out genuine small counts like '5'.
# Non-numeric leftovers (if any) still fall back to 0.0 as before.
Data_Cpy.Installs.unique()
Data_Cpy["Installs"] = [
    float(s) if s.isdigit() else float(0)
    for s in (i.replace('+', '').replace(',', '') for i in Data_Cpy["Installs"])
]
print("Old Values: ", Data['Installs'].iloc[0], " New Values: ", Data_Cpy['Installs'].iloc[0])
#8. Type: binary-encode the pricing model (Free -> 0, Paid -> 1).
Data_Cpy.Type.unique()
type_codes = {'Paid': 1, 'Free': 0}
Data_Cpy.Type = Data_Cpy.Type.map(type_codes)
#9. Last Updated: parse the date strings (e.g. 'January 7, 2018') and use
# them as a chronologically sorted index.
Data_Cpy['Last Updated'][:3]
Data_Cpy3 = Data_Cpy.copy()
# Vectorized pd.to_datetime replaces the per-row datetime.strptime loop;
# the explicit format keeps parsing strict and fast.
Data_Cpy3["Last Updated"] = pd.to_datetime(Data_Cpy3["Last Updated"], format='%B %d, %Y')
Data_Cpy3
Data_Cpy3 = Data_Cpy3.set_index("Last Updated")
Data_Cpy4 = Data_Cpy3.sort_index()
Data_Cpy4.head()
#Update The Final Valuable Data
# Sanity-check that no nulls remain, then promote the cleaned frame to be
# the working dataset for the analysis below.
Data_Cpy4.isna().any().sum()
Data = Data_Cpy4.copy()
Data.info()
#Main Analysis Part
#We Are Using EDA as Our Analysis Approach , Because it is Simple Effective and Efficient for this Data
#The EDA is Exploratory data analysis The primary goal of EDA is to maximize the analyst's insight into a data set and into the underlying #structure of a data set, while providing all of the specific items that an analyst would want to extract from a data set, such as: a #good-fitting, parsimonious model. a list of outliers.
from scipy.stats import norm
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
style = sns.color_palette("ch:2.5,-.2,dark=.3")
plt.figure(figsize=(20,5))
sns.distplot(Data['Rating'],color='red',hist_kws={"linewidth":3});
plt.title("RATING DISTRIBUTION")
plt.show()
print("Skewness :%f" % Data['Rating'].skew())
print("kurtosis :%f" % Data['Rating'].kurt())
#histogram for Free vs Paid apps
plt.figure(figsize=(20, 5))
# Pass the column via x= explicitly: positional data arguments to countplot
# were deprecated in seaborn 0.12.
sns.countplot(x=Data['Type'], color='dodgerblue')
plt.title("Type Distrubition , Free Vs Paid")
plt.show()
#It seems Most Of the Apps Published On Play Store Are Free :)
#histogram Which Describes the Counts Of Installed Apps
# Mean Reviews per Installs bucket.
plt.figure(figsize=(20, 6))
# Only palette= is kept: passing color= alongside palette= is ignored by
# seaborn and triggers a warning.
sns.barplot(x=Data['Installs'], y=Data.Reviews, palette=sns.color_palette("ch:2.5,-.2,dark=.3"))
plt.title("Installs Distrubition")
plt.xticks(rotation=80)
plt.show()
# Mean Rating per Installs bucket.
plt.figure(figsize=(20, 6))
sns.barplot(x=Data['Installs'], y=Data.Rating, palette=sns.color_palette("ch:2.5,-.2,dark=.3"))
plt.title("Installs Distrubition")
plt.xticks(rotation=80)
plt.show()
#histogram Which Describes the share of EDUCATION apps per Rating value
plt.figure(figsize=(20, 6))
# Only palette= is kept: color= alongside palette= is ignored and warns.
# NOTE(review): the title says "Installs" but this chart plots
# Category_EDUCATION against Rating — likely a copy-paste label; confirm
# intent before renaming the title.
sns.barplot(x=Data['Rating'], y=Data.Category_EDUCATION, palette=sns.color_palette("ch:2.5,-.2,dark=.3"))
plt.title("Installs Distrubition")
plt.xticks(rotation=80)
plt.show()
# Rating spread within each Installs bucket.
ax = plt.figure(figsize=(20, 5))
sns.set()
sns.boxplot(x="Installs", y="Rating", data=Data)
plt.title("Rating On Most Installs")
plt.xticks(rotation=80)
# Pairwise scatter of Rating against each one-hot category column,
# five columns per figure. (Restores the loop-body indentation that was
# lost in the original.)
chart_data = Data.loc[:, "Category_ART_AND_DESIGN":"Category_WEATHER"]
chart_data['Rating'] = Data["Rating"]
for start in range(0, len(chart_data.columns), 5):
    sns.pairplot(data=chart_data, x_vars=chart_data.columns[start:start + 5], y_vars=['Rating'])
import math
# For each category dummy, histogram the Free/Paid (Type) split of the apps
# belonging to that category, laid out in a grid of 5 columns.
# NOTE(review): chart_data still contains the 'Rating' column here, so one
# subplot histograms apps whose Rating == 1 — presumably unintended; confirm.
ncols = 5
total = len(chart_data.columns.values)
nrows = math.ceil(total / ncols)
chart_data["Type"] = Data["Type"]
plot_idx = 1
plt.subplots(figsize=(15, 10), tight_layout=True)
for col in chart_data.columns.values:
    if col == "Type":
        continue
    members = chart_data[chart_data[col] == 1]
    plt.subplot(nrows, ncols, plot_idx)
    plt.hist(members["Type"])
    plt.title(col)
    plot_idx += 1
plt.show()
# Correlation heatmap over the numeric features of the cleaned dataset.
fig, ax = plt.subplots(figsize=(8, 7))
numeric_cols = ["Reviews", "Price", "Rating", "Installs", "Size"]
ax = sns.heatmap(Data[numeric_cols].corr(), annot=True, linewidths=.5, fmt='.1f')
plt.show()